This document provides technical supporting information for the AIRBNB listing data report analysis in Hawaii, United States. This report summarizes and communicates the results of AIRBNB rooms features (# of guests, bedrooms, type of property, etc) and character (superhost, seasonality, location etc) listings that affects the pricing and occupancy.
Data provided was 23000 listings in Hawaii from 2008 to 2019.
library(tidyverse)
library(ggplot2)
library(dplyr)
library(gridExtra)
library(esquisse)
library(inspectdf)
library(tidyr)
library(DataExplorer)
library(janitor)
library(MultinomialCI)
library(scales)
library(GGally)
library(knitr)
#load data
bnb <- read.csv("listings_hw.csv")
host <- bnb %>%
dplyr::select(23, 26, 27, 29, 34, 81, 83:93, 97:99, 39:46, 49:67, -59)
head(host)
## host_since host_response_time host_response_rate host_is_superhost
## 1 2008-09-30 within a few hours 90% f
## 2 2009-01-31 within a few hours 100% f
## 3 2009-02-09 within an hour 90% f
## 4 2009-02-13 within an hour 100% f
## 5 2009-02-13 within an hour 100% f
## 6 2009-02-14 within a few hours 100% f
## host_total_listings_count availability_365 number_of_reviews
## 1 1 35 234
## 2 2 363 37
## 3 7 333 5
## 4 3 286 159
## 5 3 256 15
## 6 2 116 121
## number_of_reviews_ltm first_review last_review review_scores_rating
## 1 20 2010-07-11 2019-08-28 95
## 2 3 2013-02-18 2019-08-19 91
## 3 3 2011-05-31 2019-09-04 75
## 4 16 2010-02-16 2019-07-15 92
## 5 4 2013-11-16 2019-08-17 100
## 6 37 2009-05-07 2019-08-27 88
## review_scores_accuracy review_scores_cleanliness review_scores_checkin
## 1 10 9 10
## 2 9 9 10
## 3 10 7 10
## 4 9 9 10
## 5 10 10 10
## 6 10 9 10
## review_scores_communication review_scores_location review_scores_value
## 1 10 10 9
## 2 9 9 9
## 3 10 10 10
## 4 10 10 9
## 5 10 10 10
## 6 10 10 9
## instant_bookable is_business_travel_ready cancellation_policy
## 1 f f strict_14_with_grace_period
## 2 f f moderate
## 3 t f strict_14_with_grace_period
## 4 t f strict_14_with_grace_period
## 5 f f flexible
## 6 f f strict_14_with_grace_period
## neighbourhood neighbourhood_cleansed neighbourhood_group_cleansed
## 1 Oʻahu Primary Urban Center Honolulu
## 2 Hamakua Coast Hamakua Hawaii
## 3 Island of Hawaiʻi South Kohala Hawaii
## 4 Island of Hawaiʻi South Kona Hawaii
## 5 Kailua/Kona North Kona Hawaii
## 6 Kauaʻi Koloa-Poipu Kauai
## city state zipcode market smart_location latitude longitude
## 1 Honolulu HI 96815 Oahu Honolulu, HI 21.27603 -157.8221
## 2 Honokaa HI 96727 The Big Island Honokaa, HI 20.04095 -155.4325
## 3 Kamuela HI 96743 The Big Island Kamuela, HI 20.02621 -155.7014
## 4 Captain Cook HI 96704 The Big Island Captain Cook, HI 19.43119 -155.8808
## 5 Kailua Kona HI 96740 The Big Island Kailua Kona, HI 19.56413 -155.9635
## 6 Koloa HI 96756 Kauai Koloa, HI 21.88305 -159.4737
## is_location_exact property_type room_type accommodates bathrooms
## 1 t Condominium Private room 2 1
## 2 t Bed and breakfast Entire home/apt 2 1
## 3 f Apartment Entire home/apt 2 1
## 4 t Apartment Entire home/apt 2 1
## 5 t House Entire home/apt 6 3
## 6 t Condominium Entire home/apt 4 1
## bedrooms beds bed_type square_feet price weekly_price monthly_price
## 1 1 1 Futon NA $69.00
## 2 0 1 Real Bed 400 $85.00
## 3 1 0 Real Bed NA $97.00 $610.00
## 4 1 1 Real Bed 650 $92.00 $640.00 $2,390.00
## 5 3 4 Real Bed 2600 $299.00 $2,780.00 $8,500.00
## 6 NA 1 Real Bed NA $92.00 $602.00
## security_deposit cleaning_fee guests_included extra_people
## 1 $45.00 1 $10.00
## 2 2 $50.00
## 3 $0.00 $50.00 2 $10.00
## 4 $0.00 $75.00 2 $15.00
## 5 $300.00 $245.00 2 $30.00
## 6 $50.00 1 $0.00
description and observation of the data:
summary(host)
## host_since host_response_time host_response_rate
## 2015-12-10: 287 : 1 100% :14242
## 2012-04-16: 277 a few days or more: 367 99% : 1148
## 2017-01-16: 253 N/A : 1148 N/A : 1148
## 2017-09-20: 241 within a day : 1591 90% : 944
## 2016-08-30: 237 within a few hours: 3791 98% : 918
## 2014-07-20: 226 within an hour :16554 95% : 722
## (Other) :21931 (Other): 4330
## host_is_superhost host_total_listings_count availability_365 number_of_reviews
## : 1 Min. : 0.00 Min. : 0 Min. : 0.0
## f:15459 1st Qu.: 2.00 1st Qu.:127 1st Qu.: 1.0
## t: 7992 Median : 8.00 Median :215 Median : 6.0
## Mean : 70.47 Mean :199 Mean : 24.6
## 3rd Qu.: 64.00 3rd Qu.:281 3rd Qu.: 29.0
## Max. :1717.00 Max. :365 Max. :653.0
## NA's :1
## number_of_reviews_ltm first_review last_review
## Min. : 0.00 : 5602 : 5602
## 1st Qu.: 0.00 2018-12-31: 59 2019-08-26: 447
## Median : 3.00 2018-12-28: 45 2019-09-02: 440
## Mean : 10.35 2019-05-27: 41 2019-08-27: 402
## 3rd Qu.: 14.00 2017-12-27: 40 2019-09-08: 396
## Max. :283.00 2017-12-31: 38 2019-08-25: 395
## (Other) :17627 (Other) :15770
## review_scores_rating review_scores_accuracy review_scores_cleanliness
## Min. : 20.00 Min. : 2.00 Min. : 2.000
## 1st Qu.: 93.00 1st Qu.:10.00 1st Qu.: 9.000
## Median : 97.00 Median :10.00 Median :10.000
## Mean : 94.73 Mean : 9.68 Mean : 9.512
## 3rd Qu.:100.00 3rd Qu.:10.00 3rd Qu.:10.000
## Max. :100.00 Max. :10.00 Max. :10.000
## NA's :5774 NA's :5777 NA's :5776
## review_scores_checkin review_scores_communication review_scores_location
## Min. : 2.000 Min. : 2.00 Min. : 2.000
## 1st Qu.:10.000 1st Qu.:10.00 1st Qu.:10.000
## Median :10.000 Median :10.00 Median :10.000
## Mean : 9.782 Mean : 9.74 Mean : 9.798
## 3rd Qu.:10.000 3rd Qu.:10.00 3rd Qu.:10.000
## Max. :10.000 Max. :10.00 Max. :10.000
## NA's :5783 NA's :5778 NA's :5784
## review_scores_value instant_bookable is_business_travel_ready
## Min. : 2.000 f: 7517 f:23452
## 1st Qu.: 9.000 t:15935
## Median :10.000
## Mean : 9.434
## 3rd Qu.:10.000
## Max. :10.000
## NA's :5785
## cancellation_policy neighbourhood
## strict_14_with_grace_period:12282 Island of Hawaiʻi:4289
## super_strict_60 : 4240 Maui :3737
## moderate : 2677 Kauaʻi :3141
## flexible : 2260 Oʻahu :2394
## super_strict_30 : 1915 Honolulu :1680
## luxury_moderate : 45 Kihei/Wailea :1634
## (Other) : 33 (Other) :6577
## neighbourhood_cleansed neighbourhood_group_cleansed
## Primary Urban Center:4604 Hawaii :5644
## Kihei-Makena :3931 Honolulu:6924
## Lahaina :3173 Kauai :3141
## North Kona :2207 Maui :7743
## North Shore Kauai :1312
## Puna :1149
## (Other) :7076
## city state zipcode market
## Honolulu :4622 : 17 96815 :3684 Maui :7669
## Kihei :2951 FL: 1 96753 :3532 Oahu :6921
## Lahaina :2850 Hi: 20 96761 :3100 The Big Island :5592
## Kailua-Kona:1428 HI:23413 96740 :2056 Kauai :3136
## Princeville:1133 US: 1 96722 :1090 Other (Domestic): 84
## Koloa : 811 96756 : 961 Big Island : 36
## (Other) :9657 (Other):9029 (Other) : 14
## smart_location latitude longitude is_location_exact
## Honolulu, HI :4618 Min. :18.92 Min. :-159.7 f: 8314
## Kihei, HI :2941 1st Qu.:20.69 1st Qu.:-157.8 t:15138
## Lahaina, HI :2850 Median :20.95 Median :-156.7
## Kailua-Kona, HI:1428 Mean :20.87 Mean :-157.1
## Princeville, HI:1132 3rd Qu.:21.29 3rd Qu.:-156.3
## Koloa, HI : 811 Max. :22.23 Max. :-154.8
## (Other) :9672
## property_type room_type accommodates bathrooms
## Condominium:11222 Entire home/apt:20604 Min. : 1.000 Min. : 0.000
## House : 4216 Hotel room : 302 1st Qu.: 3.000 1st Qu.: 1.000
## Apartment : 3058 Private room : 2436 Median : 4.000 Median : 1.000
## Guest suite: 763 Shared room : 110 Mean : 4.574 Mean : 1.602
## Villa : 559 3rd Qu.: 6.000 3rd Qu.: 2.000
## Townhouse : 518 Max. :35.000 Max. :14.000
## (Other) : 3116 NA's :7
## bedrooms beds bed_type square_feet
## Min. : 0.000 Min. : 0.000 Airbed : 10 Min. : 0
## 1st Qu.: 1.000 1st Qu.: 1.000 Couch : 4 1st Qu.: 500
## Median : 1.000 Median : 2.000 Futon : 18 Median : 792
## Mean : 1.544 Mean : 2.365 Pull-out Sofa: 27 Mean : 1530
## 3rd Qu.: 2.000 3rd Qu.: 3.000 Real Bed :23393 3rd Qu.: 1200
## Max. :16.000 Max. :38.000 Max. :80000
## NA's :15 NA's :63 NA's :23287
## price weekly_price monthly_price security_deposit
## $150.00: 445 :22112 :22300 $0.00 :9034
## $99.00 : 425 $600.00 : 41 $3,000.00: 48 :4302
## $125.00: 373 $1,000.00: 38 $2,500.00: 37 $500.00:2099
## $100.00: 346 $700.00 : 33 $2,000.00: 33 $200.00:1719
## $199.00: 338 $900.00 : 31 $1,500.00: 27 $300.00:1419
## $250.00: 309 $650.00 : 30 $2,400.00: 27 $100.00:1210
## (Other):21216 (Other) : 1167 (Other) : 980 (Other):3669
## cleaning_fee guests_included extra_people
## : 1637 Min. : 1.000 $0.00 :15653
## $0.00 : 1511 1st Qu.: 1.000 $10.00 : 1649
## $150.00: 1371 Median : 1.000 $20.00 : 1273
## $100.00: 1185 Mean : 2.317 $25.00 : 1162
## $125.00: 881 3rd Qu.: 3.000 $15.00 : 861
## $75.00 : 649 Max. :34.000 $50.00 : 586
## (Other):16218 (Other): 2268
tidying up data: + has to convert price, weekly_price, monthly_price, security_deposit, cleaning_fee, extra_people into numeric data. the value of these columns includes dollar sign. stripping the symbols using gsub()
# converting strings to numberic type
host$price <- as.numeric(gsub("\\$", "", host$price))
## Warning: NAs introduced by coercion
host$weekly_price <- as.numeric(gsub("\\$", "", host$weekly_price))
## Warning: NAs introduced by coercion
host$monthly_price <- as.numeric(gsub("\\$", "", host$monthly_price))
## Warning: NAs introduced by coercion
host$security_deposit <- as.numeric(gsub("\\$", "", host$security_deposit))
## Warning: NAs introduced by coercion
host$cleaning_fee <- as.numeric(gsub("\\$", "", host$cleaning_fee))
## Warning: NAs introduced by coercion
host$extra_people <- as.numeric(gsub("\\$", "", host$extra_people))
host$host_response_rate <- as.numeric(gsub("%", "", host$host_response_rate))
## Warning: NAs introduced by coercion
# converting date to year only; to calculate host_since and active hosting long.
host <- host %>%
separate(host_since, c("host_since_year"), sep = "[-]") %>%
dplyr::mutate(host_since_long = 2019 - as.numeric(host_since_year))
## Warning: Expected 1 pieces. Additional pieces discarded in 23451 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
host <- host %>%
separate(first_review, c("first_review_year"), sep = "[-]") %>%
dplyr::mutate(first_review_long = 2019 - as.numeric(first_review_year))
## Warning: Expected 1 pieces. Additional pieces discarded in 17850 rows [1, 2, 3,
## 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, ...].
# to mark rents in which never been successfully rented by NAval.
host$NAval <- host$first_review_long
host$NAval <- ifelse((is.na(host$first_review_long)), 0 , host$NAval)
host$NAval <- ifelse((host$first_review_long > 0 ), 1 , host$NAval)
host$NAprice <- host$price
host$NAprice <- ifelse((is.na(host$price)), 0, host$NAprice)
host$NAprice <- ifelse((host$price > 0), 1, host$NAprice)
#creating DF1 for non-NA val.
df1 <- host %>%
filter(NAval != 0 & host_is_superhost != "" & NAprice != 0)
df1$HostStatus <- df1$host_is_superhost
df1$HostStatus <- ifelse((df1$host_is_superhost == "t"), "Superhost" , df1$HostStatus)
df1$HostStatus <- ifelse((df1$host_is_superhost == "f" ), "Host", df1$HostStatus)
host <- host %>%
separate(last_review, c("last_review_year"), sep = "[-]") %>%
mutate(last_review_long = 2019 - as.numeric(last_review_year),
active_hosting_long = as.numeric(last_review_year) - as.numeric(first_review_year),
avg_review_per_actv_year = number_of_reviews/active_hosting_long,
availability_rate = availability_365/365,
avg_review_per_ava = avg_review_per_actv_year/availability_365)
## Warning: Expected 1 pieces. Additional pieces discarded in 17850 rows [1, 2, 3,
## 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, ...].
grid.arrange(
ggplot(host) +
aes(x = host_since_long) +
geom_histogram(bins = 30L, fill = "#0c4c8a") +
labs(x = "how long host joined Airbnb", y = "count") +
theme_minimal(),
ggplot(host) +
aes(x = active_hosting_long) +
geom_histogram(bins = 30L, fill = "#cb181d") +
theme_minimal() +
labs (x= "active customer review period"),
ncol = 1)
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 5602 rows containing non-finite values (stat_bin).
host %>%
inspect_na() %>%
show_plot()
host %>%
inspect_cat() %>%
show_plot()
CATEGORICAL UNI-VARIATE GRAPHICAL:
grid.arrange(
ggplot(data = host, mapping = aes( x = host_is_superhost)) +
geom_bar(),
ggplot(data = host, mapping = aes( x = host_response_time)) +
geom_bar(),
ggplot(data = bnb, mapping = aes( x = cancellation_policy)) +
geom_bar(),
ggplot(data = bnb, mapping = aes( x = room_type)) +
geom_bar(),
ggplot(data = bnb, mapping = aes( x = neighbourhood_group_cleansed)) +
geom_bar(),
ggplot(data = bnb, mapping = aes( x = is_location_exact)) +
geom_bar(),
ggplot(data = bnb, mapping = aes( x = instant_bookable)) +
geom_bar(),
ggplot(data = bnb, mapping = aes( x = bed_type)) +
geom_bar(),
ggplot(data = host, mapping = aes( x = NAval)) +
geom_bar(),
ncol = 3)
## Warning: Removed 5602 rows containing non-finite values (stat_count).
NUMERICAL UNIVARIATE GRAPHICAL
# Total scores to sum the criteria of all score criteria
host <- host %>%
mutate(total_scores = review_scores_accuracy + review_scores_checkin + review_scores_cleanliness + review_scores_communication + review_scores_location + review_scores_value)
host %>%
dplyr::select(11:17) %>%
inspect_num() %>%
show_plot()
host %>%
dplyr:: select(1, 3, 5:7, 47, 50:52 ) %>%
inspectdf:: inspect_num() %>%
show_plot()
host %>%
dplyr::select(34, 37, 45, 40:42) %>%
inspect_num() %>%
show_plot()
# checking the number of Superhost: Host in percentage
bnb %>%
filter(host_is_superhost != "") %>%
tabyl(host_is_superhost) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all") %>%
adorn_pct_formatting(rounding = "half up", digits = 2)
## host_is_superhost n percent Total
## 0.0000000 0.00% 0.00%
## f 0.6591762 0.00% 65.92%
## t 0.3407812 0.00% 34.08%
## Total 0.9999574 0.00% 100.00%
host %>%
tabyl(neighbourhood_group_cleansed) %>%
adorn_totals(where = c("row", "col"))
## neighbourhood_group_cleansed n percent Total
## Hawaii 5644 0.2406618 5644.241
## Honolulu 6924 0.2952413 6924.295
## Kauai 3141 0.1339331 3141.134
## Maui 7743 0.3301637 7743.330
## Total 23452 1.0000000 23453.000
host %>%
tabyl(first_review_year) %>%
adorn_totals(where = c("row", "col"))
## first_review_year n percent Total
## 5602 2.388709e-01 5602.238871
## 2009 2 8.528057e-05 2.000085
## 2010 22 9.380863e-04 22.000938
## 2011 66 2.814259e-03 66.002814
## 2012 193 8.229575e-03 193.008230
## 2013 353 1.505202e-02 353.015052
## 2014 671 2.861163e-02 671.028612
## 2015 1273 5.428108e-02 1273.054281
## 2016 2405 1.025499e-01 2405.102550
## 2017 3520 1.500938e-01 3520.150094
## 2018 4797 2.045455e-01 4797.204545
## 2019 4548 1.939280e-01 4548.193928
## Total 23452 1.000000e+00 23453.000000
host %>%
tabyl( neighbourhood_group_cleansed, host_is_superhost) %>%
adorn_totals(where = c("row", "col"))
## neighbourhood_group_cleansed V1 f t Total
## Hawaii 0 3365 2279 5644
## Honolulu 0 4653 2271 6924
## Kauai 0 2207 934 3141
## Maui 1 5234 2508 7743
## Total 1 15459 7992 23452
# rolling sum of number of host
host %>%
filter(host_since_year != "") %>%
group_by(host_since_year) %>%
summarize(count = n()) %>%
mutate(sum_host = cumsum(count))
## # A tibble: 12 x 3
## host_since_year count sum_host
## <chr> <int> <int>
## 1 2008 29 29
## 2 2009 137 166
## 3 2010 186 352
## 4 2011 818 1170
## 5 2012 1252 2422
## 6 2013 1423 3845
## 7 2014 2694 6539
## 8 2015 4145 10684
## 9 2016 4675 15359
## 10 2017 4491 19850
## 11 2018 2390 22240
## 12 2019 1211 23451
df1 %>%
filter(df1$square_feet !="") %>%
dplyr::select(host_response_rate,host_total_listings_count, availability_365, number_of_reviews, latitude, longitude, accommodates, price, square_feet, guests_included, host_since_long) %>%
cor() %>%
round(2) %>%
kable
| host_response_rate | host_total_listings_count | availability_365 | number_of_reviews | latitude | longitude | accommodates | price | square_feet | guests_included | host_since_long | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| host_response_rate | 1 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| host_total_listings_count | NA | 1.00 | 0.09 | -0.09 | 0.14 | -0.12 | -0.03 | -0.01 | -0.12 | -0.13 | 0.00 |
| availability_365 | NA | 0.09 | 1.00 | 0.05 | -0.17 | 0.14 | 0.04 | 0.02 | 0.07 | -0.03 | 0.14 |
| number_of_reviews | NA | -0.09 | 0.05 | 1.00 | -0.11 | 0.10 | -0.22 | -0.37 | 0.03 | -0.06 | -0.01 |
| latitude | NA | 0.14 | -0.17 | -0.11 | 1.00 | -0.93 | -0.13 | 0.15 | 0.00 | 0.04 | 0.01 |
| longitude | NA | -0.12 | 0.14 | 0.10 | -0.93 | 1.00 | 0.11 | -0.11 | 0.01 | -0.05 | -0.01 |
| accommodates | NA | -0.03 | 0.04 | -0.22 | -0.13 | 0.11 | 1.00 | 0.63 | 0.51 | 0.56 | 0.03 |
| price | NA | -0.01 | 0.02 | -0.37 | 0.15 | -0.11 | 0.63 | 1.00 | 0.39 | 0.58 | -0.04 |
| square_feet | NA | -0.12 | 0.07 | 0.03 | 0.00 | 0.01 | 0.51 | 0.39 | 1.00 | 0.57 | -0.06 |
| guests_included | NA | -0.13 | -0.03 | -0.06 | 0.04 | -0.05 | 0.56 | 0.58 | 0.57 | 1.00 | -0.01 |
| host_since_long | NA | 0.00 | 0.14 | -0.01 | 0.01 | -0.01 | 0.03 | -0.04 | -0.06 | -0.01 | 1.00 |
df1 %>%
filter(HostStatus == "Superhost" & square_feet != "") %>%
dplyr::select(host_response_rate,host_total_listings_count, availability_365, number_of_reviews, latitude, longitude, accommodates, price, square_feet, guests_included, host_since_long) %>%
cor() %>%
round(2) %>%
kable
| host_response_rate | host_total_listings_count | availability_365 | number_of_reviews | latitude | longitude | accommodates | price | square_feet | guests_included | host_since_long | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| host_response_rate | 1.00 | 0.06 | 0.05 | 0.01 | -0.10 | 0.10 | -0.23 | -0.34 | -0.18 | -0.32 | -0.04 |
| host_total_listings_count | 0.06 | 1.00 | -0.16 | -0.19 | 0.36 | -0.38 | -0.10 | 0.10 | -0.19 | -0.19 | 0.10 |
| availability_365 | 0.05 | -0.16 | 1.00 | 0.11 | -0.29 | 0.29 | -0.03 | -0.28 | 0.06 | -0.21 | 0.08 |
| number_of_reviews | 0.01 | -0.19 | 0.11 | 1.00 | -0.26 | 0.27 | -0.11 | -0.46 | -0.07 | -0.05 | 0.01 |
| latitude | -0.10 | 0.36 | -0.29 | -0.26 | 1.00 | -0.94 | -0.18 | 0.35 | -0.26 | -0.05 | -0.09 |
| longitude | 0.10 | -0.38 | 0.29 | 0.27 | -0.94 | 1.00 | 0.14 | -0.28 | 0.27 | 0.04 | 0.12 |
| accommodates | -0.23 | -0.10 | -0.03 | -0.11 | -0.18 | 0.14 | 1.00 | 0.31 | 0.63 | 0.51 | 0.04 |
| price | -0.34 | 0.10 | -0.28 | -0.46 | 0.35 | -0.28 | 0.31 | 1.00 | 0.30 | 0.39 | 0.01 |
| square_feet | -0.18 | -0.19 | 0.06 | -0.07 | -0.26 | 0.27 | 0.63 | 0.30 | 1.00 | 0.53 | -0.10 |
| guests_included | -0.32 | -0.19 | -0.21 | -0.05 | -0.05 | 0.04 | 0.51 | 0.39 | 0.53 | 1.00 | 0.02 |
| host_since_long | -0.04 | 0.10 | 0.08 | 0.01 | -0.09 | 0.12 | 0.04 | 0.01 | -0.10 | 0.02 | 1.00 |
4A SUPERHOSTS distribution accross different variables
df1 <- df1 %>%
filter(!(host_response_time %in% "N/A"))
grid.arrange(
ggplot(df1) +
aes(x = host_response_time, fill = host_is_superhost) +
geom_bar(position = "dodge") +
scale_fill_hue() +
theme_minimal() + theme(legend.position = "none"),
ggplot(df1) +
aes(x = neighbourhood_group_cleansed, fill = host_is_superhost) +
geom_bar(position = "dodge") +
scale_fill_hue() +
theme_minimal() +
theme(legend.position = "none"),
ggplot(df1) +
aes(x = instant_bookable, fill = host_is_superhost) +
geom_bar(position = "dodge") +
scale_fill_hue() +
theme_minimal() +
theme(legend.position = "none"),
ggplot(df1) +
aes(x = cancellation_policy, fill = host_is_superhost) +
geom_bar(position = "dodge") +
scale_fill_hue() +
theme_minimal() +
theme(legend.position = "none"),
ncol = 2)
4B How about the price? Does nightly rate differs in different listing location?
ggplot(df1) +
aes(x = price, fill = host_is_superhost) +
geom_histogram(bins = 30L) +
scale_fill_hue() +
theme_minimal() +
theme(legend.position = "none") +
facet_wrap(vars(neighbourhood_group_cleansed))
grid.arrange(
ggplot(df1) +
aes(x = neighbourhood_group_cleansed, y = price, fill = neighbourhood_group_cleansed) +
geom_boxplot() +
scale_fill_hue() +
theme_minimal() + coord_flip()+ theme(legend.position = "none"),
ncol = 1)
df1 %>%
group_by(cancellation_policy, host_is_superhost) %>%
subset(cancellation_policy %in% c("super_strict_30", "flexible", "strict_14_with_grace_period")) %>%
filter(price != "" ) %>%
summarise(price_median = median(price)) %>%
ggplot(aes(x = reorder(cancellation_policy, price_median), y = price_median, fill = reorder(host_is_superhost, price_median))) +
geom_bar(stat = "identity", position = "dodge") + coord_flip() + theme_classic() + theme(legend.position="bottom")
df1 %>%
group_by(cancellation_policy, host_is_superhost, neighbourhood_group_cleansed) %>%
subset(cancellation_policy %in% c("super_strict_30", "flexible", "strict_14_with_grace_period")) %>%
filter(price != "" ) %>%
summarise(price_median = mean(price)) %>%
ggplot(aes(x = reorder(cancellation_policy, price_median), y = price_median, fill = reorder(host_is_superhost, price_median))) +
geom_bar(stat = "identity", position = "dodge") + coord_flip() + theme_classic() + theme(legend.position="bottom") + facet_wrap(~neighbourhood_group_cleansed)
df1 %>%
group_by(host_is_superhost, neighbourhood_group_cleansed) %>%
summarise(avg_price = median(price)) %>%
ggplot(aes(x = reorder(neighbourhood_group_cleansed, -avg_price), y = avg_price, fill = host_is_superhost)) +
geom_bar(stat = "identity", position = "dodge") +
geom_hline(yintercept = 174, size = 0.3) + theme_classic() + theme(legend.position = "none")
df1 %>%
filter(square_feet != "") %>%
group_by(host_is_superhost, neighbourhood_group_cleansed) %>%
summarise(avg_accom = median(accommodates)) %>%
ggplot(aes(x = neighbourhood_group_cleansed, y = avg_accom, fill = host_is_superhost)) +
geom_bar(stat = "identity", position = "dodge") + theme(legend.position = "none")
df1 %>%
filter(square_feet != "") %>%
group_by(host_is_superhost, neighbourhood_group_cleansed) %>%
summarise(avg_sqft = median(square_feet)) %>%
ggplot(aes(x = neighbourhood_group_cleansed, y = avg_sqft, fill = host_is_superhost)) +
geom_bar(stat = "identity", position = "dodge") + theme(legend.position = "bottom")
df1 %>%
filter(review_scores_rating != "") %>%
group_by(host_is_superhost, neighbourhood_group_cleansed) %>%
summarise(review_scores_rating_mean = mean(review_scores_rating)) %>%
ggplot(aes(x = host_is_superhost, y = review_scores_rating_mean, fill = host_is_superhost)) +
geom_bar(stat = "identity", position = "dodge") + facet_wrap(~neighbourhood_group_cleansed) +
#geom_hline(yintercept = 1) +
theme(legend.position = "none")
df1 %>%
filter(number_of_reviews != "") %>%
group_by(host_is_superhost) %>%
summarise(number_of_reviews_ave = median(number_of_reviews)) %>%
ggplot(aes(x = host_is_superhost, y = number_of_reviews_ave, fill = host_is_superhost)) +
geom_bar(stat = "identity", position = "dodge") +
#geom_hline(yintercept = 1) +
theme(legend.position = "none")
df1 %>%
filter(number_of_reviews != "") %>%
group_by(host_is_superhost, neighbourhood_group_cleansed) %>%
summarise(number_of_reviews_ave = mean(number_of_reviews)) %>%
ggplot(aes(x = host_is_superhost, y = number_of_reviews_ave, fill = host_is_superhost)) +
geom_bar(stat = "identity", position = "dodge") + facet_wrap(~neighbourhood_group_cleansed) +
#geom_hline(yintercept = 1) +
theme(legend.position = "none")
df1 %>%
filter(number_of_reviews >= 3) %>%
group_by(HostStatus, review_scores_rating, host_response_rate) %>%
summarise(review_scores_avg = median(review_scores_rating),
response_rate_avg = mean(host_response_rate)) %>%
ggplot(aes(x = response_rate_avg, y = review_scores_avg, color = HostStatus)) +
geom_point(position=position_jitter(h=0.20,w=0.20), size = 1) +
#acet_wrap(~HostStatus) +
theme_classic() +
xlim(60,100) +
ylim(60,100) +
annotate("rect", xmin = 90, xmax = 100, ymin = 95, ymax = 100, fill = "yellow", alpha = 0.60) +
annotate("text", x = 95, y = 97.5, label = "Superhost", size = 3) +
theme(legend.position = "bottom",
plot.title = element_text(face = "bold"),
plot.subtitle = element_text(face = "italic")) +
labs(x = "Average Response Rate (in %)",
y = "Average Review Scores (Out of 100)",
title = "the Superhost status is not being fairly implemented",
subtitle = "Although hosts have hit 90% Response Rate and 96% Review Scores, \nHosts are yet to be awarded with the Superhost status, and vice versa") + scale_color_manual(values = c("#7584AD", "#FFB997"))
## Warning: Removed 129 rows containing missing values (geom_point).
ggsave("scores_rate.png", width = 7, height = 5)
## Warning: Removed 129 rows containing missing values (geom_point).
grid.arrange(
df1 %>%
group_by(host_since_year) %>%
summarize(count = n()) %>%
mutate(sum_host = cumsum(count)) %>% ggplot(aes(x = host_since_year, y = sum_host)) + geom_col() + theme_classic() + labs(title = "the increase rate of new host decrease over time"),
ncol =1)
df1%>%
group_by(neighbourhood_group_cleansed, host_since_year) %>%
summarize(count = n()) %>%
mutate(sum_host = cumsum(count)) %>%
ggplot(aes(x = host_since_year, y = sum_host, fill = neighbourhood_group_cleansed, group = neighbourhood_group_cleansed)) + geom_col() + theme_classic() + theme(legend.position=c(0.30, 0.70))
df1%>%
group_by(HostStatus, host_since_year) %>%
summarize(count = n()) %>%
mutate(sum_host = cumsum(count)) %>%
ggplot(aes(x = host_since_year, y = sum_host, color= HostStatus, group = HostStatus)) +
geom_line(size = 1) +
theme_classic() +
theme(legend.position=c(0.25, 0.50)) +
annotate("rect", xmin = 7, xmax = 10, ymin = 40, ymax = 8000, fill = "lightgreen", alpha = 0.15) +
labs(y = "Total Hosts",
x = "Year Joined",
title = "Growth of new Airbnb hosts in Hawaii accelerated from 2014 to 2017",
subtitle = )
df1%>%
dplyr::select(host_is_superhost, first_review_year, price, number_of_reviews) %>%
dplyr::group_by(first_review_year) %>%
summarize(ave_price = median(price),
med_rev = median(number_of_reviews)) %>%
ggplot(aes(x = first_review_year, y = ave_price))+
geom_point() +
theme_classic()
df1%>%
group_by(HostStatus, host_since_year) %>%
summarize(count = n()) %>%
mutate(sum_host = cumsum(count)) %>%
ggplot(aes(x = host_since_year, y = sum_host, fill = HostStatus, group = HostStatus)) +
geom_col() + theme_classic() + theme(legend.position=c(0.15, 0.70)) +
labs( x = "Year Joined",
y = "Cum. Sum of Hosts in Hawaii",
title = "Superhost vs. Host ratio",
subtitle = "Given that only 34% of hosts are Superhost") +
scale_fill_manual(values = c("#7584AD", "#FFB997"))
ggsave("superhost.png", width = 7, height = 5)
#median nightly rate in different area
df1 %>%
filter(neighbourhood_group_cleansed != "Kauai" & neighbourhood_group_cleansed !="Hawaii") %>%
group_by(host_is_superhost, neighbourhood_group_cleansed) %>%
summarise(avg_price = mean(price)) %>%
ggplot(aes(x = reorder(neighbourhood_group_cleansed, -avg_price), y = avg_price, fill = host_is_superhost)) +
geom_bar(stat = "identity", position = "dodge") +
geom_hline(yintercept = 174, size = 0.3) + theme_classic() +
theme(legend.position = "none",
plot.title = element_text(face = "bold"),
plot.subtitle = element_text(face = "italic")) +
scale_fill_manual(values = c("#7584AD", "#FFB997")) +
labs( x = "Location ",
y = "Average nightly rate ($)",
title = "Comparing average nightly rate of listings in Maui and Honolulu",
subtitle = "Maui offers significantly higher average nightly rate than Honolulu \n Regardless, regular hosts set higher listing price than Superhost",
caption = "Data Provided by Airbnb") +
geom_text(aes(x = neighbourhood_group_cleansed, y = avg_price, label = round(avg_price, digits = 2)),
position = position_dodge2(width = 1),
size = 3.5, vjust = 3.25, color = "white", fontface = "bold"
) +
scale_y_continuous(labels = dollar)
ggsave("aveprice_compare.png", width = 7, height = 5)
df1 %>%
filter(square_feet != "") %>%
summarize(med_p = median(number_of_reviews))
## med_p
## 1 50
df1 %>%
filter(neighbourhood_group_cleansed != "Kauai" & neighbourhood_group_cleansed !="Hawaii") %>%
group_by(host_is_superhost, neighbourhood_group_cleansed) %>%
summarise(avg_reviw = mean(number_of_reviews)) %>%
ggplot(aes(x = reorder(neighbourhood_group_cleansed, avg_reviw), y = avg_reviw, fill = host_is_superhost)) +
geom_bar(stat = "identity", position = "dodge") + theme_classic() +
theme(legend.position = "none",
plot.title = element_text(face = "bold", size = 11),
plot.subtitle = element_text(face = "italic", size = 10)) +
scale_fill_manual(values = c("#7584AD", "#FFB997")) +
geom_hline(yintercept = 50, size = 0.3, linetype ="dashed") +
labs(x = "Location",
y = "Average number of reviews",
title = "Comparing average number of review of property listing in Mauai and Honolulu",
subtitle = "Consistently, Superhosts seemed to have higher number of reviews, \nAlthough not a perfect parameter, Superhosts have better guests engagement, and predictively, occupancy. ",
caption = "Data provided by Airbnb.com") +
geom_text(aes(x = neighbourhood_group_cleansed, y = avg_reviw, label = round(avg_reviw, digits = 0)),
position = position_dodge2(width = 1),
size = 4, vjust = 3.25, color = "white", fontface = "bold"
) +
ggsave("numrev_compare.png", width = 7, height = 5)
a <- df1 %>%
group_by(HostStatus) %>%
summarise(count = n()) %>%
ggplot(aes(x = "", y = count, fill = HostStatus)) +
geom_bar(stat = "identity", width = 1)
a + coord_polar("y", start = 0) + theme_classic() + theme(legend.position = "bottom")
options(scipen = 10)
Running regression analysis to find which variable that causes most impact on price change.
fit_mod <- lm(price~ factor(host_is_superhost) + review_scores_rating + factor(neighbourhood_group_cleansed) + accommodates + host_since_long, data = df1)
summary(fit_mod)
##
## Call:
## lm(formula = price ~ factor(host_is_superhost) + review_scores_rating +
## factor(neighbourhood_group_cleansed) + accommodates + host_since_long,
## data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -679.34 -67.70 -19.94 34.34 900.21
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -98.8451 17.0050 -5.813
## factor(host_is_superhost)t -7.2786 2.3525 -3.094
## review_scores_rating 1.0642 0.1782 5.972
## factor(neighbourhood_group_cleansed)Honolulu 22.4852 2.9511 7.619
## factor(neighbourhood_group_cleansed)Kauai 79.0343 3.6401 21.712
## factor(neighbourhood_group_cleansed)Maui 87.8281 2.9487 29.786
## accommodates 40.1617 0.5096 78.815
## host_since_long 0.2694 0.5507 0.489
## Pr(>|t|)
## (Intercept) 0.0000000062970396 ***
## factor(host_is_superhost)t 0.00198 **
## review_scores_rating 0.0000000024092542 ***
## factor(neighbourhood_group_cleansed)Honolulu 0.0000000000000274 ***
## factor(neighbourhood_group_cleansed)Kauai < 2e-16 ***
## factor(neighbourhood_group_cleansed)Maui < 2e-16 ***
## accommodates < 2e-16 ***
## host_since_long 0.62462
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 122.2 on 12462 degrees of freedom
## (57 observations deleted due to missingness)
## Multiple R-squared: 0.3866, Adjusted R-squared: 0.3862
## F-statistic: 1122 on 7 and 12462 DF, p-value: < 2.2e-16
Count1 <- df1 %>%
group_by(neighbourhood_group_cleansed) %>%
summarise(count = n())
medpri <- df1 %>%
group_by(neighbourhood_group_cleansed) %>%
summarise(meanPrice = median(price), sumRevenueALL = sum(price))
mod1 <- merge(Count1, medpri, by = c("neighbourhood_group_cleansed"))
mod1
## neighbourhood_group_cleansed count meanPrice sumRevenueALL
## 1 Hawaii 3256 125 580130
## 2 Honolulu 3802 137 697000
## 3 Kauai 1756 220 474987
## 4 Maui 3713 225 988990
df1 %>%
filter(HostStatus == "Superhost"& review_scores_rating > 96 & host_response_rate > 90) %>%
dplyr::select(HostStatus, review_scores_rating, host_response_rate) %>%
group_by(HostStatus) %>%
summarise(count = n())
## # A tibble: 1 x 2
## HostStatus count
## <chr> <int>
## 1 Superhost 3737
df1 %>%
filter(HostStatus == "Host") %>%
dplyr::select(HostStatus, review_scores_rating, host_response_rate) %>%
group_by(HostStatus) %>%
summarise(count = n())
## # A tibble: 1 x 2
## HostStatus count
## <chr> <int>
## 1 Host 6885
df1 %>%
dplyr::select(price, HostStatus) %>%
group_by(HostStatus) %>%
summarise(med_price = median(price), count = n())
## # A tibble: 2 x 3
## HostStatus med_price count
## <chr> <dbl> <int>
## 1 Host 180 6885
## 2 Superhost 165 5642
df_sh <- df1 %>%
filter(HostStatus == "Superhost")
df_h <- df1 %>%
filter(HostStatus == "Host")
t.test(df_h$price, df_sh$price, conf.level = 0.99)
##
## Welch Two Sample t-test
##
## data: df_h$price and df_sh$price
## t = 6.0834, df = 12279, p-value = 0.000000001212
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
## 9.79513 24.18542
## sample estimates:
## mean of x mean of y
## 226.4681 209.4778
df_sh_hon <- df1 %>%
filter(HostStatus == "Superhost" & neighbourhood_group_cleansed == "Honolulu" )
df_h_maui <- df1 %>%
filter(HostStatus == "Host" & neighbourhood_group_cleansed == "Maui")
t.test(df_h_maui$price, df_sh_hon$price, conf.level = 0.99)
##
## Welch Two Sample t-test
##
## data: df_h_maui$price and df_sh_hon$price
## t = 18.471, df = 3467.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
## 77.54635 102.69474
## sample estimates:
## mean of x mean of y
## 273.1263 183.0057
df_sh_rev <- df1 %>%
filter(HostStatus == "Superhost" )
df_h_rev <- df1 %>%
filter(HostStatus == "Host")
t.test(df_sh_rev$number_of_reviews, df_h_rev$number_of_reviews, conf.level = 0.95)
##
## Welch Two Sample t-test
##
## data: df_sh_rev$number_of_reviews and df_h_rev$number_of_reviews
## t = 24.757, df = 9731.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 22.50023 26.36971
## sample estimates:
## mean of x mean of y
## 56.30876 31.87378
df_sh_rev <- df1 %>%
filter(HostStatus == "Superhost" )
df_h_rev <- df1 %>%
filter(HostStatus == "Host")
t.test(df_sh_rev $number_of_reviews, conf.level = 0.95)
##
## One Sample t-test
##
## data: df_sh_rev$number_of_reviews
## t = 67.377, df = 5641, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 54.67040 57.94711
## sample estimates:
## mean of x
## 56.30876
df_sh <- df1 %>%
filter(HostStatus == "Superhost" & neighbourhood_group_cleansed == "Honolulu" )
df_h <- df1 %>%
filter(HostStatus == "Host" & neighbourhood_group_cleansed == "Maui" )
t.test(df_sh$square_feet, df_h$square_feet, conf.level = 0.90)
##
## Welch Two Sample t-test
##
## data: df_sh$square_feet and df_h$square_feet
## t = 1.0364, df = 8.1974, p-value = 0.3296
## alternative hypothesis: true difference in means is not equal to 0
## 90 percent confidence interval:
## -241.7619 854.8333
## sample estimates:
## mean of x mean of y
## 1046.7500 740.2143
df_sh <- df1 %>%
filter(HostStatus == "Superhost" )
df_h<- df1 %>%
filter(HostStatus == "Host")
t.test(df_sh$review_scores_rating, df_h$review_scores_rating, conf.level = 0.99)
##
## Welch Two Sample t-test
##
## data: df_sh$review_scores_rating and df_h$review_scores_rating
## t = 43.28, df = 9451.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
## 4.216194 4.749925
## sample estimates:
## mean of x mean of y
## 97.15319 92.67013
df_sh <- df1 %>%
filter(HostStatus == "Superhost" )
df_h<- df1 %>%
filter(HostStatus == "Host")
t.test(df_sh$host_response_rate, df_h$host_response_rate, conf.level = 0.99)
##
## Welch Two Sample t-test
##
## data: df_sh$host_response_rate and df_h$host_response_rate
## t = 25.079, df = 9394.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
## 3.354901 4.123128
## sample estimates:
## mean of x mean of y
## 98.90677 95.16776
df_t <- df1 %>%
filter(review_scores_rating >= 97 & host_response_rate >= 90)
t.test(df_t$price, conf.level = 0.95)
##
## One Sample t-test
##
## data: df_t$price
## t = 108.06, df = 5720, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 230.0809 238.5829
## sample estimates:
## mean of x
## 234.3319
df_sh <- df1 %>%
filter(HostStatus == "Superhost" )
t.test(df_sh$price, conf.level = 0.95)
##
## One Sample t-test
##
## data: df_sh$price
## t = 103.93, df = 5641, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 205.5266 213.4291
## sample estimates:
## mean of x
## 209.4778
df_t <- df1 %>%
filter(review_scores_rating >= 97 & host_response_rate >= 90)
df_sh <- df1 %>%
filter(HostStatus == "Superhost" )
t.test(df_t$price, df_sh$price, conf.level = 0.95)
##
## Welch Two Sample t-test
##
## data: df_t$price and df_sh$price
## t = 8.3952, df = 11312, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 19.05097 30.65721
## sample estimates:
## mean of x mean of y
## 234.3319 209.4778
t.test(df1$price, conf.level = 0.95)
##
## One Sample t-test
##
## data: df1$price
## t = 156.35, df = 12526, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 216.0726 221.5592
## sample estimates:
## mean of x
## 218.8159